Next, consider how to crawl pages that do not expose an API, so we can handle some of the more unusual cases.
Below we try to gauge the netizens' sentiment through the PTT Gossiping board.
www.ptt.cc
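Before building the full crawler, the short probe below (a sketch, not part of the final class) shows why this board needs an extra step: without the over18 cookie, PTT is expected to redirect the Gossiping index to its age-confirmation page.

import requests

# Minimal probe: without the "over18" cookie, PTT redirects the Gossiping
# index to the age-confirmation page instead of serving the article list.
rs = requests.Session()
res = rs.get('https://www.ptt.cc/bbs/Gossiping/index.html')
print(res.url)  # expected to end with /ask/over18?from=... when the age gate is hit

# Setting the cookie directly (the class below instead POSTs the consent form)
# lets the same session reach the real board index.
rs.cookies.set('over18', '1', domain='.ptt.cc')
res = rs.get('https://www.ptt.cc/bbs/Gossiping/index.html')
print(res.status_code)  # 200, and res.text now contains the .r-ent rows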
The following code does the actual crawling:

import requests
import cloudscraper  # only used by the commented-out Cloudflare fallback below
from bs4 import BeautifulSoup


class PttGossiping:
    def __init__(self):
        # Confirm the over-18 prompt once so the session keeps the over18 cookie
        payload = {
            'from': '/bbs/Gossiping/index.html',
            'yes': 'yes'
        }
        self.rs = requests.Session()
        res = self.rs.post('https://www.ptt.cc/ask/over18', data=payload)
        #self.scraper = cloudscraper.create_scraper(sess=self.rs)
        #res = self.scraper.post('https://www.ptt.cc/ask/over18', data=payload)
        self.soups = []

    def fetch_article(self, fetch_link=None):
        # set the fetch url then req and soup
        if not fetch_link:
            # default to the board index on www.ptt.cc; the selectors used
            # elsewhere in this class assume ptt.cc markup
            fetch_link = 'https://www.ptt.cc/bbs/Gossiping/index.html'
            #fetch_link = 'https://www.pttweb.cc/bbs/Gossiping'
        res = self.rs.get(fetch_link)
        soup = BeautifulSoup(res.text, 'html.parser')
        return soup

    def get_next_page_link(self, origin_link):
        # on the PTT index, the「上頁」(previous page) button points to the next batch of older articles
        original_soup = self.fetch_article(origin_link)
        for row in original_soup.select('.btn-group-paging > .btn'):
            if '上頁' in row.text:
                page_link = row.get('href')
                full_url = f'https://www.ptt.cc{page_link}'
                return full_url

    def fetch_bao_article(self, threshold=None, pages=None, fetch_link=None):
        if not threshold:
            threshold = 80
        if not pages:
            pages = 10
        # page first
        if not fetch_link:
            fetch_link = 'https://www.ptt.cc/bbs/Gossiping/index.html'
        link = fetch_link
        print(f'Going to fetch {fetch_link}')
        soup = self.fetch_article(fetch_link)
        self.soups.append(soup)
        # additional page 1 ~ N
        for i in range(pages):
            print(f'Fetching page ... {i+1}')
            link = self.get_next_page_link(link)
            soup = self.fetch_article(link)
            self.soups.append(soup)
        results = []
        for soup in self.soups:
            for row in soup.select('.r-ent'):
                try:
                    rank = row.select('.hl')[0].text
                    title = None
                    link = None
                    if 'X' in rank:
                        continue
                    elif rank == '爆' or (int(rank) >= threshold):
                        title = row.select('.title')[0].text.strip('\n')
                        uri = row.find_all('a')[0]['href']
                        link = 'https://www.ptt.cc' + uri
                        res = {}
                        title = title.replace('\u3000', ' ')
                        res['title'] = title
                        res['link'] = link
                        res['rank'] = rank
                        results.append(res)
                except IndexError:
                    # deleted articles have no push count or title link; skip them
                    pass
                except Exception as e:
                    print(e)
        return results
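A short usage sketch follows; the threshold and page count here are arbitrary, and the actual output depends on whatever is on the board when you run it.

if __name__ == '__main__':
    ptt = PttGossiping()
    # collect articles whose push count is「爆」or at least 80, across 5 extra pages
    hot_articles = ptt.fetch_bao_article(threshold=80, pages=5)
    for article in hot_articles:
        print(article['rank'], article['title'])
        print(article['link'])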
Here we use find_all or find to pick out the features (tags and CSS classes) we are interested in.
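As a quick illustration of the difference between the two, the HTML below is a hand-written, simplified stand-in for one .r-ent row, not real board output:

from bs4 import BeautifulSoup

# Simplified stand-in for a single .r-ent row on the index page.
html = '''
<div class="r-ent">
  <div class="nrec"><span class="hl f1">爆</span></div>
  <div class="title"><a href="/bbs/Gossiping/M.1234567890.A.ABC.html">[問卦] 標題範例</a></div>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')
row = soup.find('div', class_='r-ent')      # find(): first match (or None)
links = row.find_all('a')                   # find_all(): always a list, possibly empty
print(row.find('span', class_='hl').text)   # 爆
print(links[0]['href'])                     # /bbs/Gossiping/M.1234567890.A.ABC.html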